Audio Embedding
  • Audio Preprocessing: Load, process and extract embeddings from various audio samples using an open-source library such as scipy or librosa.
  • Embedding Creation: Convert the audio signals into embeddings using a pre-trained model or a simple RNN-based architecture (this notebook uses a time-averaged log-spectrogram as a lightweight stand-in).
  • Vector Database: Use faiss-cpu to store and search for audio embeddings efficiently.
  • RAG (Retrieval-Augmented Generation): Implement the query mechanism to retrieve the most similar audio embeddings.
  • Visualization: Display tables for audio metadata and embeddings, along with graphs for original and predicted audio.
In [ ]:
%pip install -q numpy matplotlib scipy faiss-cpu librosa pandas
Python interpreter will be restarted.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
petastorm 0.11.4 requires pyspark>=2.1.0, which is not installed.
pandas-profiling 3.1.0 requires joblib~=1.0.1, but you have joblib 1.4.2 which is incompatible.
mleap 0.20.0 requires scikit-learn<0.23.0,>=0.22.0, but you have scikit-learn 1.6.1 which is incompatible.
Python interpreter will be restarted.
In [ ]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import faiss
import pandas as pd
from scipy.signal import spectrogram

# Directory that holds the audio clips to index. It is read by the module-level
# indexing/query statements below and by plot_predicted_audio_waveforms().
audio_directory = "/curated/AudioStore/audios/"

def get_audio_files(directory, extensions=(".mp3",)):
    """Return the names of the audio files found directly inside *directory*.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).
        extensions: Filename suffixes to accept, matched case-insensitively.
            Defaults to MP3 files only.

    Returns:
        A sorted list of bare filenames (no directory prefix). The list is
        sorted because ``os.listdir`` returns entries in arbitrary order, and
        callers use list positions as row ids in the vector index.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(suffixes)
    )

def extract_spectrogram_embedding(audio_file, *, sr=None, nperseg=256):
    """Compute a fixed-length log-spectrogram embedding for one audio file.

    The clip is loaded with librosa, its spectrogram is computed with
    ``scipy.signal.spectrogram``, converted to log scale, and averaged over
    time so that every clip maps to one vector of ``nperseg // 2 + 1`` values.

    Args:
        audio_file: Path of the audio file to load.
        sr: Target sample rate passed to ``librosa.load``. ``None`` (default)
            keeps each file's native rate; pass a fixed rate to make the
            frequency bins of different files line up.
        nperseg: Spectrogram segment length in samples. The default matches
            scipy's own default of 256.

    Returns:
        1-D NumPy array holding the time-averaged log power per frequency bin.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    samples, rate = librosa.load(audio_file, sr=sr)
    if samples.size == 0:
        raise ValueError(f"Audio file contains no samples: {audio_file}")

    # scipy silently shrinks nperseg for clips shorter than one segment, which
    # would change the embedding length; zero-pad so every clip yields the
    # same number of frequency bins.
    if samples.size < nperseg:
        samples = np.pad(samples, (0, nperseg - samples.size))

    _, _, power = spectrogram(samples, fs=rate, nperseg=nperseg)

    # Log scale compresses the dynamic range; the offset avoids log(0).
    log_power = np.log(power + 1e-7)

    # Collapse the time axis so the embedding length is independent of duration.
    return np.mean(log_power, axis=1)

# Discover the clips to index; list positions double as FAISS row ids.
audio_files = get_audio_files(audio_directory)

# One embedding per clip, kept in the same order as audio_files.
embeddings = [
    extract_spectrogram_embedding(os.path.join(audio_directory, clip_name))
    for clip_name in audio_files
]

def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over the given embeddings.

    Args:
        embeddings: Sequence of equal-length 1-D vectors (or a 2-D array),
            one row per item to index.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row, in input order.

    Raises:
        ValueError: If *embeddings* is empty or does not form a non-empty
            2-D matrix (e.g. a flat list or zero-length vectors).
    """
    matrix = np.array(embeddings).astype('float32')

    # Fail early with a clear message instead of an IndexError on shape[1].
    if matrix.ndim != 2 or matrix.shape[0] == 0 or matrix.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of equal-length "
            f"vectors, got array with shape {matrix.shape}"
        )

    dim = matrix.shape[1]  # dimensionality of each embedding
    index = faiss.IndexFlatL2(dim)  # exact search using L2 distance
    index.add(matrix)
    return index

# Build the FAISS index over every extracted embedding; row i of the index
# corresponds to audio_files[i].
faiss_index = create_faiss_index(embeddings)

def retrieve_similar_audio(query_file, faiss_index, k=2):
    """Find the indexed clips closest to a query audio file.

    Args:
        query_file: Path of the audio file used as the query.
        faiss_index: FAISS index to search.
        k: Maximum number of neighbours to return. Values larger than the
            number of indexed vectors are clamped, because FAISS otherwise
            pads the result with ``-1`` labels that would be mistaken for
            valid (negative) list positions by callers.

    Returns:
        Tuple ``(indices, distances)``; both are arrays of shape
        ``(1, min(k, faiss_index.ntotal))`` as returned by ``search``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the index is empty.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    total = int(faiss_index.ntotal)
    if total == 0:
        raise ValueError("Cannot search an empty FAISS index")
    k = min(k, total)

    query_embedding = extract_spectrogram_embedding(query_file)
    # FAISS expects a float32 matrix with one row per query.
    query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, k)
    return indices, distances

# Query example: use one of the indexed clips ("duskwolf_101348.mp3") as the
# query and retrieve its two nearest neighbours from the index.
query_audio_file = os.path.join(audio_directory, "duskwolf_101348.mp3")
indices, distances = retrieve_similar_audio(query_audio_file, faiss_index, k=2)

def display_audio_table(data, title):
    """Print a titled table of audio files and their embedding previews.

    Args:
        data: Iterable of ``(index, audio_name, embedding_preview)`` rows.
        title: Heading printed on its own line above the table.

    The table is rendered as Markdown when pandas can do so. Markdown output
    relies on the optional ``tabulate`` package; when it is not installed the
    table falls back to pandas' plain-text layout instead of raising.
    """
    df = pd.DataFrame(data, columns=['Index', 'Audio', 'Embedding (First 5 embeddings)'])
    try:
        table = df.to_markdown(index=False)
    except ImportError:
        # 'tabulate' is missing: degrade to a plain-text table.
        table = df.to_string(index=False)
    print(f"\n{title}")
    print(table)

def display_results(audio_files, embeddings, indices, query_audio_file):
    """Show the indexed clips, the query waveform, and the retrieved matches.

    Args:
        audio_files: Filenames of the indexed clips.
        embeddings: Embedding vectors, positionally aligned with *audio_files*.
        indices: Search result whose first row holds the retrieved positions.
        query_audio_file: Path of the audio file that was used as the query.
    """
    # Table 1: every indexed clip with the first five embedding values.
    catalogue_rows = [
        (position, name, embeddings[position][:5])
        for position, name in enumerate(audio_files)
    ]
    display_audio_table(catalogue_rows, "Original Audio Files and Embeddings:")

    # Waveform of the query clip.
    plot_query_audio_waveform(query_audio_file)

    # Table 2: the retrieved clips, numbered by rank in the result list.
    hits = indices[0]
    hit_names = [audio_files[hit] for hit in hits]
    hit_vectors = [embeddings[hit] for hit in hits]
    hit_rows = [
        (rank, name, vector[:5])
        for rank, (name, vector) in enumerate(zip(hit_names, hit_vectors))
    ]
    display_audio_table(hit_rows, "Predicted Audio Files and Embeddings:")

    # Waveforms of the retrieved clips, side by side.
    plot_predicted_audio_waveforms(audio_files, indices)

def plot_query_audio_waveform(query_audio_file):
    """Plot the raw waveform of the query audio file.

    Args:
        query_audio_file: Path of the audio file to load and plot. Only its
            base name is shown in the figure title.
    """
    # Title shows the file name without its directory.
    heading = f"User Query Audio Waveform: {os.path.basename(query_audio_file)}"

    # sr=None keeps the file's native sample rate.
    samples, _ = librosa.load(query_audio_file, sr=None)

    plt.figure(figsize=(6, 4))
    plt.plot(samples)
    plt.title(heading)
    plt.xlabel("Sample Number")
    plt.ylabel("Amplitude")
    plt.show()

def plot_predicted_audio_waveforms(audio_files, indices, directory=None):
    """Plot the waveforms of the retrieved clips side by side.

    Args:
        audio_files: Filenames of the indexed clips.
        indices: Search result whose first row holds the retrieved positions
            into *audio_files*. Negative entries (the ``-1`` label FAISS uses
            for "no result") are skipped.
        directory: Folder containing the clips. Defaults to the module-level
            ``audio_directory``.

    Returns:
        None. Nothing is drawn when there are no valid positions to plot.
    """
    hits = [int(idx) for idx in indices[0] if idx >= 0]
    if not hits:
        # plt.subplots(1, 0) would raise, so there is nothing to draw.
        return

    if directory is None:
        directory = audio_directory

    # squeeze=False always yields a 2-D axes array, so a single hit needs no
    # special-casing.
    _fig, axes = plt.subplots(1, len(hits), figsize=(12, 4), squeeze=False)

    for ax, idx in zip(axes[0], hits):
        retrieved_audio_file = audio_files[idx]
        retrieved_audio_name = os.path.basename(retrieved_audio_file)
        y, _sr = librosa.load(os.path.join(directory, retrieved_audio_file), sr=None)
        ax.plot(y)
        ax.set_title(f"Predicted Audio Waveform: {retrieved_audio_name}")
        ax.set_xlabel("Sample Number")
        ax.set_ylabel("Amplitude")

    plt.tight_layout()  # keep neighbouring subplots from overlapping
    plt.show()

# Render the tables and waveform plots for the query and its retrieved matches.
display_results(audio_files, embeddings, indices, query_audio_file)

Original Audio Files and Embeddings:
|   Index | Audio                              | Embedding (First 5 embeddings)                                |
|--------:|:-----------------------------------|:--------------------------------------------------------------|
|       0 | diesel_mercedes_190_d_33940.mp3    | [-15.053708 -12.10706  -12.368397 -13.738781 -14.303338]      |
|       1 | dog1_small_barking_angirly.mp3     | [-15.80812  -15.75914  -15.568337 -15.184358 -14.459221]      |
|       2 | dog2_small_dog_barking.mp3         | [-15.837855 -15.630339 -15.422517 -14.892724 -15.095776]      |
|       3 | dog3_barking.mp3                   | [-15.591373 -15.58906  -15.320949 -14.866169 -14.144432]      |
|       4 | duskwolf_101348.mp3                | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784]      |
|       5 | truck1_Hyundai_Tractor_Engine.mp3  | [-13.268114  -9.427747 -11.069034 -11.546685  -9.743669]      |
|       6 | truck2_Dododge.mp3                 | [-15.470654 -12.924455 -12.648085 -13.403902 -14.288434]      |
|       7 | truck_diesel_07dodge_rev_98278.mp3 | [-15.307348 -13.277036 -12.684735 -13.181681 -14.059005]      |
|       8 | wol3_wolves.mp3                    | [-16.056688 -16.060814 -15.998393 -15.741171 -15.141277]      |
|       9 | wolf1_howling.mp3                  | [-15.400748 -14.87836  -13.977464 -13.397399 -15.081363]      |
|      10 | wolf2_howling_wolves.mp3           | [-15.7981415 -15.991037  -15.964015  -15.954524  -15.771357 ] |
No description has been provided for this image
Predicted Audio Files and Embeddings:
|   Index | Audio               | Embedding (First 5 embeddings)                           |
|--------:|:--------------------|:---------------------------------------------------------|
|       0 | duskwolf_101348.mp3 | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784] |
|       1 | dog3_barking.mp3    | [-15.591373 -15.58906  -15.320949 -14.866169 -14.144432] |
No description has been provided for this image